Prerequisites

Loading the required packages

library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)

Dataset

Import processed data, which can be found here.

#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')

Get sample of dataset

#set seed value to birthday of Ricardo Rodriguez, American wrestler and ring announcer and Dr. Reinaldo (Rei) Sanchez-Arias
set.seed(19630217)

#set percentage to test with for simplicity, if needed
percentage <- 5
wine_sample<- sample_n(wines, percentage/100*nrow(wines))

Split Taster data into different Data Frame

# One row per distinct taster name / Twitter handle pair.
tasters <- wines %>%
  select(taster_name, taster_twitter_handle) %>%
  unique()
tasters

Drop taster_twitter_handle in wines dataframe

# Remove the Twitter handle column from the wine table.
wines <- wines %>%
  select(-taster_twitter_handle)
head(wines)

Add Reviewer profile info

Each reviewer has their own bias. To offset that, we made a “profile” for each reviewer which includes characteristics like: avg_points, sd_points, and var_points

taster_rating_profile <- wines %>%
  group_by(taster_name) %>%
  summarize(
    avg_points = mean(points),
    sd_points = sd(points),
    var_points = var(points),
    reviews = n()
  )

tasters <- inner_join(tasters, taster_rating_profile, by = "taster_name")
head(tasters)

Add Rating Classification

Add following classification to wine dataset as found on the website:

Category Rating Description
Classic 98-100 The pinnacle of quality.
Superb 94-97 A great achievement.
Excellent 90-93 Highly recommended.
Very Good 87-89 Often good value; well recommended.
Good 83-86 Suitable for everyday consumption; often good value.
Acceptable 80-82 Can be employed in casual, less-critical circumstances
# Map Wine Enthusiast scores to their rating category.
#
# points: numeric vector of scores (a single score also works).
# Returns a character vector the same length as `points`; an NA score gives
# NA. Any score below 83 is labelled "Acceptable".
rating_category <- function(points) {
  # Lower bound of every category above "Acceptable", in ascending order.
  lower_bounds <- c(83, 87, 90, 94, 98)
  categories <- c("Acceptable", "Good", "Very Good", "Excellent", "Superb",
                  "Classic")
  # findInterval() counts how many lower bounds each score has reached, so
  # the whole vector is classified in one call.
  categories[findInterval(points, lower_bounds) + 1L]
}

# Label every wine with its rating category. rating_category() is applied one
# row at a time via rowwise(); ungroup() afterwards so the row-wise grouping
# is not carried along in `wines`.
wines <- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points)) %>%
  ungroup()
head(wines)

Add Adjusted Points

Since each reviewer has a different bias, we created a normalized metric, norm_points, by looking at the number of standard deviations a wine is from the reviewer’s avg_points. This gives us a more accurate representation of which wines are better than the rest.

# Add `norm_points`: how many of the reviewer's own standard deviations a
# wine's score lies above or below that reviewer's average score.
#
# data: data frame with `taster_name` and `points` columns.
# Reads the global `tasters` table for the per-reviewer avg_points/sd_points.
# Returns `data` plus `norm_points`, with the joined helper columns removed.
# norm_points is not finite where sd_points is NA or 0.
normalize_points <- function(data) {
  data %>%
    left_join(tasters, by = "taster_name") %>%
    # The arithmetic below is vectorised, so rowwise() is not needed; drop
    # any row-wise grouping inherited from `data` so it is not returned.
    ungroup() %>%
    mutate(norm_points = (points - avg_points) / sd_points) %>%
    select(-avg_points, -sd_points, -var_points, -taster_twitter_handle,
           -reviews)
}

wines <- normalize_points(wines)
head(wines) 

Data Sanitation

Vintage seems to have year 7200

wines <- wines %>%
  filter(vintage<2019)

Data Exploration

Univariate Exploration

Correlating price by points, using DataExplorer library which can be found here

# plot_correlation() needs the data itself, not a character vector naming the
# columns. Use pairwise-complete observations so that missing values do not
# turn the correlation into NA.
wines %>%
  ungroup() %>%
  select(price, points) %>%
  plot_correlation(cor_args = list(use = "pairwise.complete.obs"))

Alcohol Amount

# Distribution of alcohol content between 0 and 25 percent. A histogram bins
# the raw column itself, so no grouping is needed beforehand.
wines %>%
  ggplot(mapping = aes(x = alcohol)) +
  geom_histogram(na.rm = TRUE, bins = 50) +
  scale_x_continuous(
    name = "Alcohol Percentage",
    breaks = seq(0, 25, 1),
    limits = c(0, 25)
  )

Category

wines %>% 
#  group_by(points) %>% 
#  count(category) %>%  
  ggplot() +
  facet_wrap(~ category) +
  geom_point(mapping = aes(x=points, y = price))

Vintage

Count wines per year (Note: Data has been sanitized)

wines %>%
  group_by(vintage) %>%
  summarize(count = n())
Grouping rowwise data frame strips rowwise nature
wines %>%
  ggplot() +
  geom_bar(mapping = aes(x=vintage))

Winery

To better understand the number of wines per winery, we made a univariate visualization that counts the wines per winery, showing only the top 15 wineries to give you an idea of which wineries have the largest selection of wines.

# Count wines per winery and plot the 15 wineries with the most wines.
wines %>%
  group_by(winery) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:15) %>%
  ggplot() +
  geom_col(mapping = aes(x = count, y = winery))

Province

To better understand the number of wines per province, we made a univariate visualization that counts the wines per province, showing only the 10 provinces with the most wines. This gives the reader an idea of where their wine was most likely made, with California standing out as the clear leader.

wines %>% 
  group_by(province) %>% 
  summarize(count = n()) %>% 
  arrange(desc(count)) %>% 
  slice(1:10) %>% 
  ggplot()+
  geom_col(aes(x = count, y = province))

Price

Calculating the Mean, Standard Deviation, Minimum, and Max Price for the entire wine dataset and printing the values.

mean_price <- print(mean(wines$price, na.rm = TRUE))
sd_price <- print(sd(wines$price, na.rm = TRUE))
min_price <- print(min(wines$price, na.rm = TRUE))
max_price <- print(max(wines$price, na.rm = TRUE))

Points

Calculating the Mean, Standard Deviation, Minimum, and Max Points for the entire wine dataset and printing the values.

print(mean(wines$points))
print(sd(wines$points))
print(min(wines$points))
print(max(wines$points))

To help you understand the point distribution by reviewer, we made a multivariate visualization that shows the points given by each taster alongside the average wine points, marked by the vertical line (x-intercept). This gives you, the reader, an idea of how each reviewer compares to the overall average.

wines %>%
  ggplot() +
  geom_boxplot(aes(y=taster_name, x=points)) +
  geom_vline(xintercept = mean(wines$points))

Multivariate Exploration

Price by Points

Notice the data is “stacked” and the scores range from 80-100

TODO: IZZY (Why did we log this?)

Data Analysis

To help you understand the data analysis, we found the best province for wine by using the average points across the wines.

# Average score per province, keeping only the provinces whose average beats
# the overall mean score, plotted as bars.
mean_points <- mean(wines$points)
mean_points

best_province <- wines %>%
  group_by(province) %>%
  # Collapse to one row per province; the result keeps a real `points`
  # column so the filter below compares numbers.
  summarise(points = mean(points)) %>%
  filter(points > mean_points) %>%
  arrange(desc(points)) %>%
  ggplot() +
  geom_col(mapping = aes(x = province, y = points))
best_province

Best wine, by variety

#wine_best_variety <- 
wines %>% 
  group_by(variety) %>% 
  summarise(mean_points = mean(points)) %>% 
  arrange(desc(mean_points)) 
NA
# Recommend the highest-scoring wines within a budget.
# readline() returns "" immediately when the code is not run interactively,
# which would turn the budget into NA and return no wines, so only prompt in
# interactive sessions and otherwise fall back to a default budget.
default_price <- 50
user_price <- if (interactive()) {
  readline(prompt = "How much are you willing to spend on a bottle?")
} else {
  ""
}
# as.numeric() keeps budgets such as 19.99 instead of truncating them;
# anything that is not a number becomes NA and is replaced by the default.
user_price <- suppressWarnings(as.numeric(user_price))
if (is.na(user_price)) {
  message("No valid budget entered; using default of ", default_price, ".")
  user_price <- default_price
}

wines %>%
  filter(price <= user_price) %>%
  arrange(desc(points)) %>%
  select(title, price, points)

Conclusion

<<<<<<< HEAD
---
title: "Exploring and Analyzing Wine Enthusiast Reviews"
output: html_notebook
---

# Prerequisites

Loading the required packages
```{r, message=FALSE, warning=FALSE}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)
```

# Dataset

Import processed data, which can be found [here](https://github.com/C4rbyn3m4n/wine_reviews_data_analysis/blob/master/data/processed_data/preprocessing.rmd).

```{r}
#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')
```

Get sample of dataset
```{r}
# Fix the RNG seed so the sample is reproducible. The seed is the birthday of
# Ricardo Rodriguez, American wrestler and ring announcer, and of
# Dr. Reinaldo (Rei) Sanchez-Arias.
set.seed(19630217)

# Percentage of the data set to draw as a smaller sample for quick tests.
percentage <- 5
wine_sample <- wines %>%
  sample_n(size = percentage / 100 * nrow(wines))
```

### Split Taster data into different Data Frame

```{r}
# One row per distinct taster name / Twitter handle pair.
tasters <- unique(select(wines, taster_name, taster_twitter_handle))
tasters
```

Drop `taster_twitter_handle` in wines dataframe

```{r}
wines <- wines %>%
  select(-taster_twitter_handle)
head(wines)
```
## Add Reviewer profile info

Each reviewer has their own bias. To offset that, we made a "profile" for each reviewer which includes characteristics like: `avg_points`, `sd_points`, and `var_points`
```{r}
# Per-reviewer scoring profile: mean, standard deviation and variance of the
# points they awarded, plus how many reviews they wrote.
taster_rating_profile <- summarise(
  group_by(wines, taster_name),
  avg_points = mean(points),
  sd_points = sd(points),
  var_points = var(points),
  reviews = n()
)

# Attach the profile columns to the taster table, matching on taster_name.
tasters <- tasters %>%
  inner_join(taster_rating_profile, by = "taster_name")
head(tasters)
```
### Add Rating Classification

Add following classification to wine dataset as found on the [website](https://www.winemag.com/2010/04/09/you-asked-how-is-a-wines-score-determined/):

|Category  | Rating  | Description                                            |
|----------|---------|--------------------------------------------------------|
|Classic   |	98-100 | The pinnacle of quality.                               |
|Superb    |	94-97	 | A great achievement.                                   |
|Excellent |	90-93	 | Highly recommended.                                    |
|Very Good |  87-89	 | Often good value; well recommended.                    |
|Good	     |  83-86	 | Suitable for everyday consumption; often good value.   |
|Acceptable|	80-82	 | Can be employed in casual, less-critical circumstances |

```{r}
# Map Wine Enthusiast scores to their rating category.
#
# points: numeric vector of scores (a single score also works).
# Returns a character vector the same length as `points`; an NA score gives
# NA. Any score below 83 is labelled "Acceptable".
rating_category <- function(points) {
  # Lower bound of every category above "Acceptable", in ascending order.
  lower_bounds <- c(83, 87, 90, 94, 98)
  categories <- c("Acceptable", "Good", "Very Good", "Excellent", "Superb",
                  "Classic")
  # findInterval() counts how many lower bounds each score has reached, so
  # the whole vector is classified in one call.
  categories[findInterval(points, lower_bounds) + 1L]
}

# Label every wine with its rating category. rating_category() is applied one
# row at a time via rowwise(); ungroup() afterwards so the row-wise grouping
# is not carried along in `wines`.
wines <- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points)) %>%
  ungroup()
head(wines)
```

## Add Adjusted Points

Since each reviewer has a different bias, we created a normalized metric, `norm_points`, by looking at the number of standard deviations a wine is from the reviewer's `avg_points`. This gives us a more accurate representation of which wines are better than the rest.

```{r}
# Add `norm_points`: how many of the reviewer's own standard deviations a
# wine's score lies above or below that reviewer's average score.
#
# data: data frame with `taster_name` and `points` columns.
# Reads the global `tasters` table for the per-reviewer avg_points/sd_points.
# Returns `data` plus `norm_points`, with the joined helper columns removed.
# norm_points is not finite where sd_points is NA or 0.
normalize_points <- function(data) {
  data %>%
    left_join(tasters, by = "taster_name") %>%
    # The arithmetic below is vectorised, so rowwise() is not needed; drop
    # any row-wise grouping inherited from `data` so it is not returned.
    ungroup() %>%
    mutate(norm_points = (points - avg_points) / sd_points) %>%
    select(-avg_points, -sd_points, -var_points, -taster_twitter_handle,
           -reviews)
}

wines <- normalize_points(wines)
head(wines) 
```

## Data Sanitation

Vintage seems to have year 7200
``` {r}
# Keep only wines with a vintage before 2019, which removes implausible
# years such as 7200 (rows with a missing vintage are dropped as well).
wines <- filter(wines, vintage < 2019)
```
# Data Exploration

## Univariate Exploration
Correlating `price` by `points`, using ```DataExplorer``` library which can be found [here](https://datascienceplus.com/blazing-fast-eda-in-r-with-dataexplorer/)
```{r}
# TODO: IZZY
```

### Alcohol Amount
```{r}
# TODO: IZZY
```

### Category
```{r}
# TODO: IZZY
```

### Vintage
Count wines per year (Note: Data has been sanitized)
```{r}
# Number of wines for each vintage year.
summarise(group_by(wines, vintage), count = n())
```


```{r}
# Bar chart of the number of wines per vintage year.
ggplot(data = wines) +
  geom_bar(mapping = aes(x = vintage))
```

### Winery
To better understand the number of wines per winery, we made a univariate visualization that counts the wines per winery, showing only the top 15 wineries to give you an idea of which wineries have the largest selection of wines.
```{r}
# Count wines per winery and plot the 15 wineries with the most wines.
wines %>%
  group_by(winery) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:15) %>%
  ggplot() +
  geom_col(mapping = aes(x = count, y = winery))
```

### Province
To better understand the number of wines per province, we made a univariate visualization that counts the wines per province, showing only the 10 provinces with the most wines. This gives the reader an idea of where their wine was most likely made, with California standing out as the clear leader.
```{r}
# Count wines per province and chart the ten provinces with the most wines.
summarise(group_by(wines, province), count = n()) %>%
  arrange(desc(count)) %>%
  slice(seq_len(10)) %>%
  ggplot(mapping = aes(x = count, y = province)) +
  geom_col()
```

### Price
Calculating the Mean, Standard Deviation, Minimum, and Max Price for the entire wine dataset and printing the values.
```{r}
# Summary statistics of price, ignoring missing prices. Each value is stored
# first and then printed in the same order.
mean_price <- mean(wines$price, na.rm = TRUE)
sd_price <- sd(wines$price, na.rm = TRUE)
min_price <- min(wines$price, na.rm = TRUE)
max_price <- max(wines$price, na.rm = TRUE)
print(mean_price)
print(sd_price)
print(min_price)
print(max_price)
```

###  Points 
Calculating the Mean, Standard Deviation, Minimum, and Max Points for the entire wine dataset and printing the values.
```{r}
# Print the mean, standard deviation, minimum and maximum of the points.
print(mean(wines[["points"]]))
print(sd(wines[["points"]]))
print(min(wines[["points"]]))
print(max(wines[["points"]]))

```



To help you understand the point distribution by reviewer, we made a multivariate visualization that shows the points given by each taster alongside the average wine points, marked by the vertical line (x-intercept). This gives you, the reader, an idea of how each reviewer compares to the overall average.
```{r}
# Box plot of points per taster, with a vertical line at the overall mean.
ggplot(data = wines) +
  geom_boxplot(mapping = aes(x = points, y = taster_name)) +
  geom_vline(xintercept = mean(wines$points))
```

## Multivariate Exploration

## Price by Points
Notice the data is "stacked" and the scores range from 80-100
```{r}
# Scatter plot of price against points. Rows with missing values are dropped
# silently (na.rm) and points are semi-transparent to reveal overplotting.
wines %>%
  ggplot() +
  geom_point(
    mapping = aes(x = points, y = price),
    na.rm = TRUE,
    alpha = 0.15
  ) +
  labs(title = "Price by Points", x = "Points", y = "Price")
```

TODO: IZZY (Why did we log this?)

```{r}
# Same scatter plot with the natural log of price on the y axis.
# NOTE(review): presumably the log is there to spread out skewed prices;
# confirm with the authors.
wines %>%
  ggplot() +
  geom_point(
    mapping = aes(x = points, y = log(price)),
    na.rm = TRUE,
    alpha = 0.15
  ) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
```

# Data Analysis
To help you understand the data analysis, we found the best province for wine by using the average points across the wines. 
```{r}
# Average score per province, keeping only the provinces whose average beats
# the overall mean score, plotted as bars.
mean_points <- mean(wines$points)
best_province <- wines %>%
  group_by(province) %>%
  # Collapse to one row per province; the result keeps a real `points`
  # column so the filter below compares numbers.
  summarise(points = mean(points)) %>%
  filter(points > mean_points) %>%
  arrange(desc(points)) %>%
  ggplot() +
  geom_col(mapping = aes(x = province, y = points))
best_province
```


Best wine, by variety
```{r}
# Average points per grape variety, best-scoring varieties first.
summarise(group_by(wines, variety), mean_points = mean(points)) %>%
  arrange(desc(mean_points))
  
```

```{r}
# Recommend the highest-scoring wines within a budget.
# readline() returns "" immediately when the code is not run interactively,
# which would turn the budget into NA and return no wines, so only prompt in
# interactive sessions and otherwise fall back to a default budget.
default_price <- 50
user_price <- if (interactive()) {
  readline(prompt = "How much are you willing to spend on a bottle?")
} else {
  ""
}
# as.numeric() keeps budgets such as 19.99 instead of truncating them;
# anything that is not a number becomes NA and is replaced by the default.
user_price <- suppressWarnings(as.numeric(user_price))
if (is.na(user_price)) {
  message("No valid budget entered; using default of ", default_price, ".")
  user_price <- default_price
}

wines %>%
  filter(price <= user_price) %>%
  arrange(desc(points)) %>%
  select(title, price, points)
```


# Conclusion

=======
---
title: "Exploring and Analyzing Wine Enthusiast Reviews"
output: html_notebook
---

# Prerequisites

Loading the required packages
```{r, message=FALSE, warning=FALSE}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)
```

# Dataset

Import processed data, which can be found [here](https://github.com/C4rbyn3m4n/wine_reviews_data_analysis/blob/master/data/processed_data/preprocessing.rmd).

```{r}
#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')
```

Get sample of dataset
```{r}
# Fix the RNG seed so the sample is reproducible. The seed is the birthday of
# Ricardo Rodriguez, American wrestler and ring announcer, and of
# Dr. Reinaldo (Rei) Sanchez-Arias.
set.seed(19630217)

# Percentage of the data set to draw as a smaller sample for quick tests.
percentage <- 5
wine_sample <- wines %>%
  sample_n(size = percentage / 100 * nrow(wines))
```

### Split Taster data into different Data Frame

```{r}
# One row per distinct taster name / Twitter handle pair.
tasters <- unique(select(wines, taster_name, taster_twitter_handle))
tasters
```

Drop `taster_twitter_handle` in wines dataframe

```{r}
wines <- wines %>%
  select(-taster_twitter_handle)
head(wines)
```
## Add Reviewer profile info

Each reviewer has their own bias. To offset that, we made a "profile" for each reviewer which includes characteristics like: `avg_points`, `sd_points`, and `var_points`
```{r}
# Per-reviewer scoring profile: mean, standard deviation and variance of the
# points they awarded, plus how many reviews they wrote.
taster_rating_profile <- summarise(
  group_by(wines, taster_name),
  avg_points = mean(points),
  sd_points = sd(points),
  var_points = var(points),
  reviews = n()
)

# Attach the profile columns to the taster table, matching on taster_name.
tasters <- tasters %>%
  inner_join(taster_rating_profile, by = "taster_name")
head(tasters)
```
### Add Rating Classification

Add following classification to wine dataset as found on the [website](https://www.winemag.com/2010/04/09/you-asked-how-is-a-wines-score-determined/):

|Category  | Rating  | Description                                            |
|----------|---------|--------------------------------------------------------|
|Classic   |	98-100 | The pinnacle of quality.                               |
|Superb    |	94-97	 | A great achievement.                                   |
|Excellent |	90-93	 | Highly recommended.                                    |
|Very Good |  87-89	 | Often good value; well recommended.                    |
|Good	     |  83-86	 | Suitable for everyday consumption; often good value.   |
|Acceptable|	80-82	 | Can be employed in casual, less-critical circumstances |

```{r}
# Map Wine Enthusiast scores to their rating category.
#
# points: numeric vector of scores (a single score also works).
# Returns a character vector the same length as `points`; an NA score gives
# NA. Any score below 83 is labelled "Acceptable".
rating_category <- function(points) {
  # Lower bound of every category above "Acceptable", in ascending order.
  lower_bounds <- c(83, 87, 90, 94, 98)
  categories <- c("Acceptable", "Good", "Very Good", "Excellent", "Superb",
                  "Classic")
  # findInterval() counts how many lower bounds each score has reached, so
  # the whole vector is classified in one call.
  categories[findInterval(points, lower_bounds) + 1L]
}

# Label every wine with its rating category. rating_category() is applied one
# row at a time via rowwise(); ungroup() afterwards so the row-wise grouping
# is not carried along in `wines`.
wines <- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points)) %>%
  ungroup()
head(wines)
```

## Add Adjusted Points

Since each reviewer has a different bias, we created a normalized metric, `norm_points`, by looking at the number of standard deviations a wine is from the reviewer's `avg_points`. This gives us a more accurate representation of which wines are better than the rest.

```{r}
# Add `norm_points`: how many of the reviewer's own standard deviations a
# wine's score lies above or below that reviewer's average score.
#
# data: data frame with `taster_name` and `points` columns.
# Reads the global `tasters` table for the per-reviewer avg_points/sd_points.
# Returns `data` plus `norm_points`, with the joined helper columns removed.
# norm_points is not finite where sd_points is NA or 0.
normalize_points <- function(data) {
  data %>%
    left_join(tasters, by = "taster_name") %>%
    # The arithmetic below is vectorised, so rowwise() is not needed; drop
    # any row-wise grouping inherited from `data` so it is not returned.
    ungroup() %>%
    mutate(norm_points = (points - avg_points) / sd_points) %>%
    select(-avg_points, -sd_points, -var_points, -taster_twitter_handle,
           -reviews)
}

wines <- normalize_points(wines)
head(wines) 
```

## Data Sanitation

Vintage seems to have year 7200
``` {r}
# Keep only wines with a vintage before 2019, which removes implausible
# years such as 7200 (rows with a missing vintage are dropped as well).
wines <- filter(wines, vintage < 2019)
```
# Data Exploration

## Univariate Exploration
Correlating `price` by `points`, using ```DataExplorer``` library which can be found [here](https://datascienceplus.com/blazing-fast-eda-in-r-with-dataexplorer/)
```{r}
# plot_correlation() needs the data itself, not a character vector naming the
# columns. Use pairwise-complete observations so that missing values do not
# turn the correlation into NA.
wines %>%
  ungroup() %>%
  select(price, points) %>%
  plot_correlation(cor_args = list(use = "pairwise.complete.obs"))
```

### Alcohol Amount
```{r}
# Distribution of alcohol content between 0 and 25 percent. A histogram bins
# the raw column itself, so no grouping is needed beforehand.
wines %>%
  ggplot(mapping = aes(x = alcohol)) +
  geom_histogram(na.rm = TRUE, bins = 50) +
  scale_x_continuous(
    name = "Alcohol Percentage",
    breaks = seq(0, 25, 1),
    limits = c(0, 25)
  )
```

### Category
```{r}
# Price against points, drawn as one panel per wine category.
ggplot(data = wines) +
  geom_point(mapping = aes(x = points, y = price)) +
  facet_wrap(~ category)
```

### Vintage
Count wines per year (Note: Data has been sanitized)
```{r}
# Number of wines for each vintage year.
summarise(group_by(wines, vintage), count = n())
```


```{r}
# Bar chart of the number of wines per vintage year.
ggplot(data = wines) +
  geom_bar(mapping = aes(x = vintage))
```

### Winery
To better understand the number of wines per winery, we made a univariate visualization that counts the wines per winery, showing only the top 15 wineries to give you an idea of which wineries have the largest selection of wines.
```{r}
# Count wines per winery and plot the 15 wineries with the most wines.
wines %>%
  group_by(winery) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:15) %>%
  ggplot() +
  geom_col(mapping = aes(x = count, y = winery))
```

### Province
To better understand the number of wines per province, we made a univariate visualization that counts the wines per province, showing only the 10 provinces with the most wines. This gives the reader an idea of where their wine was most likely made, with California standing out as the clear leader.
```{r}
# Count wines per province and chart the ten provinces with the most wines.
summarise(group_by(wines, province), count = n()) %>%
  arrange(desc(count)) %>%
  slice(seq_len(10)) %>%
  ggplot(mapping = aes(x = count, y = province)) +
  geom_col()
```

### Price
Calculating the Mean, Standard Deviation, Minimum, and Max Price for the entire wine dataset and printing the values.
```{r}
# Summary statistics of price, ignoring missing prices. Each value is stored
# first and then printed in the same order.
mean_price <- mean(wines$price, na.rm = TRUE)
sd_price <- sd(wines$price, na.rm = TRUE)
min_price <- min(wines$price, na.rm = TRUE)
max_price <- max(wines$price, na.rm = TRUE)
print(mean_price)
print(sd_price)
print(min_price)
print(max_price)
```

###  Points 
Calculating the Mean, Standard Deviation, Minimum, and Max Points for the entire wine dataset and printing the values.
```{r}
# Print the mean, standard deviation, minimum and maximum of the points.
print(mean(wines[["points"]]))
print(sd(wines[["points"]]))
print(min(wines[["points"]]))
print(max(wines[["points"]]))

```



To help you understand the point distribution by reviewer, we made a multivariate visualization that shows the points given by each taster alongside the average wine points, marked by the vertical line (x-intercept). This gives you, the reader, an idea of how each reviewer compares to the overall average.
```{r}
# Box plot of points per taster, with a vertical line at the overall mean.
ggplot(data = wines) +
  geom_boxplot(mapping = aes(x = points, y = taster_name)) +
  geom_vline(xintercept = mean(wines$points))
```

## Multivariate Exploration

## Price by Points
Notice the data is "stacked" and the scores range from 80-100
```{r}
# Scatter plot of price against points. Rows with missing values are dropped
# silently (na.rm) and points are semi-transparent to reveal overplotting.
wines %>%
  ggplot() +
  geom_point(
    mapping = aes(x = points, y = price),
    na.rm = TRUE,
    alpha = 0.15
  ) +
  labs(title = "Price by Points", x = "Points", y = "Price")
```

TODO: IZZY (Why did we log this?)

```{r}
# Same scatter plot with the natural log of price on the y axis.
# NOTE(review): presumably the log is there to spread out skewed prices;
# confirm with the authors.
wines %>%
  ggplot() +
  geom_point(
    mapping = aes(x = points, y = log(price)),
    na.rm = TRUE,
    alpha = 0.15
  ) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
```

# Data Analysis

To find the best province for wine, we use the average points across the dataset.

TODO: drop the descriptions or just select price? Set points to max(points)?
```{r}
# Average score per province, keeping only the provinces whose average beats
# the overall mean score, best province first.
mean_points <- mean(wines$points)
mean_points

best_province <- wines %>%
  # Aggregate by province so the result describes provinces, not individual
  # wines.
  group_by(province) %>%
  summarise(points = mean(points)) %>%
  filter(points > mean_points) %>%
  arrange(desc(points))
best_province
```


Best wine, by variety
```{r}
# Average points per grape variety, best-scoring varieties first.
summarise(group_by(wines, variety), mean_points = mean(points)) %>%
  arrange(desc(mean_points))
  
```

```{r}
# Recommend the highest-scoring wines within a budget.
# readline() returns "" immediately when the code is not run interactively,
# which would turn the budget into NA and return no wines, so only prompt in
# interactive sessions and otherwise fall back to a default budget.
default_price <- 50
user_price <- if (interactive()) {
  readline(prompt = "How much are you willing to spend on a bottle?")
} else {
  ""
}
# as.numeric() keeps budgets such as 19.99 instead of truncating them;
# anything that is not a number becomes NA and is replaced by the default.
user_price <- suppressWarnings(as.numeric(user_price))
if (is.na(user_price)) {
  message("No valid budget entered; using default of ", default_price, ".")
  user_price <- default_price
}

wines %>%
  filter(price <= user_price) %>%
  arrange(desc(points)) %>%
  select(title, price, points)
```


# Conclusion

>>>>>>> 5eefb62a423836334020d43ec36a96579d6cc365